
/* Video output stage */

#include <assert.h>
#include <limits.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "oslib/os.h"
#include "oslib/wimp.h"

#include "cpu.h"
#include "dma.h"
#include "defs.h"
#include "log.h"
#include "pci.h"
#include "utils.h"
#include "video_out.h"

/* Index of the matching NVidia PCI function to use: output_init() repeats the
   PCI search SCREEN_ID+1 times, each search continuing from the previous
   match, so 0 selects the first function found */
#define SCREEN_ID 0

#ifdef USE_DMA

/* Set by output_init() when the card's PCI id is >= 0x02000000.
   NOTE(review): presumably selects the colour component order expected by
   the card - confirm against the code that reads this flag */
bool gfxcard_pc_rgb = false;

/* Both flags are written by output_init(); nothing in this part of the file
   reads them */
static bool dma_available = false;
static bool got_vrambase = false;
/* Physical base address of the graphics memory, filled in by output_init();
   get_addresses() uses it as the base of every DMA destination address */
unsigned vram_base;

/* exported properties of desktop screen mode
   (all refreshed by output_get_screen_info()) */
byte *screen_base = NULL;   /* display start address (logical) */
int screen_width;           /* X window limit + 1 */
int screen_height;          /* Y window limit + 1 */
int screen_line_length;     /* line length mode variable */
int screen_lg2bpp;          /* log2 of the bits per pixel */
int xeig, yeig;             /* X and Y eigen factors */

/* VDU/mode variables requested by output_get_screen_info(); the order here
   fixes the index of each value in the result array read there, and the
   list is terminated by -1 */
static const os_VDU_VAR_LIST(8) vars =
{{ os_VDUVAR_DISPLAY_START, os_MODEVAR_LINE_LENGTH, os_MODEVAR_LOG2_BPP,
   os_MODEVAR_XEIG_FACTOR, os_MODEVAR_YEIG_FACTOR,
   os_MODEVAR_XWIND_LIMIT, os_MODEVAR_YWIND_LIMIT, -1 }};


/* Number of DMA chain descriptors carved out of each block that
   get_dma_desc() allocates when its free list is empty */
#define ALLOC_DMA_DESCS 2048

/* Head of the singly-linked (via ->next) list of unused DMA chain descriptors */
static dma_chain_desc *dma_free = NULL;


/* Memory page size as reported by xos_read_mem_map_info() in output_init();
   get_addresses() uses it to split source scanlines at page boundaries */
static int page_size;


os_error *output_init(void)
{
  os_error *err = xos_read_mem_map_info(&page_size, NULL);
  if (!err) err = output_get_screen_info();
  if (!err)
  {
    /* discover the physical address of the graphics memory using the slightly
       underhand technique of scanning the PCI devices for an NVidia card;
       OS_Memory won't simply give us the screen memory's physical address
       because it only translates addresses in the main system RAM */

    pci_function fn = 0;
    int id = 0;
    while (id++ <= SCREEN_ID && !err)
    {
      /* look for all NVidia devices and filter out any that are known not to work */
      err = xpci_find_by_id(0x10de, -1, -1, fn, -1, &fn);
    }

    if (!err)
    {
      if (fn)
      {
        /* should be BAR1 but check the range just in case */
        pci_id id;
        bits size;
        if (!xpci_read_id(fn, &id, NULL))
          gfxcard_pc_rgb = (id >= 0x02000000); /* pretty arbitrary threshold */

        err = xpci_hardware_address(0x80000000U, 1, fn, NULL, &vram_base, &size, NULL);
        if (!err)
        {
          if (size > 0x200000)
          {
            got_vrambase = true;
          }
          else
            err = (os_error*)"\0\0\0\0Not enough VRAM";//lookup_error(error_NOT_ENOUGH_VRAM);
        }

//??? for now
        dma_available = true;
      }
      else
        err = (os_error*)"\0\0\0\0Cannot find graphics card"; //lookup_error(error_CANT_FIND_GCARD);
    }
#ifdef ARM9_BUILD
    else
    {
      vram_base = 0x10000000;
      got_vrambase = true;
      dma_available = true;
    }
#endif

//    if (!got_vrambase)
//      log_error(lookup_error1(error_CANT_FIND_VRAM, err->errmess));

    return NULL;
  }
  return err;
}

/* Refresh the exported screen mode properties (screen_base, screen_width,
 * screen_height, screen_line_length, screen_lg2bpp, xeig, yeig) from the
 * VDU/mode variables listed in 'vars'.
 *
 * Returns NULL on success; on failure returns the error from
 * xos_read_vdu_variables and leaves the exported properties untouched.
 */
os_error *output_get_screen_info(void)
{
  /* positions of the results, matching the order of the entries in 'vars' */
  enum
  {
    VAL_DISPLAY_START,
    VAL_LINE_LENGTH,
    VAL_LOG2_BPP,
    VAL_XEIG,
    VAL_YEIG,
    VAL_XWIND_LIMIT,
    VAL_YWIND_LIMIT,
    VAL_COUNT
  };
  int vals[VAL_COUNT + 1];
  os_error *err = xos_read_vdu_variables((os_vdu_var_list*)vars.var, vals);

  if (err)
    return err;

  screen_base        = (unsigned char*)vals[VAL_DISPLAY_START];
  screen_line_length = vals[VAL_LINE_LENGTH];
  screen_lg2bpp      = vals[VAL_LOG2_BPP];
  xeig               = vals[VAL_XEIG];
  yeig               = vals[VAL_YEIG];

  /* the window limits are the highest valid coordinates, hence the +1 */
  screen_width       = vals[VAL_XWIND_LIMIT] + 1;
  screen_height      = vals[VAL_YWIND_LIMIT] + 1;

  return NULL;
}

/* Return a whole chain of DMA descriptors to the free list.
 *
 * pd  address of the caller's pointer to the first descriptor of a chain
 *     linked through ->next and terminated by NULL.  The chain is pushed
 *     onto the front of the free list and *pd is reset to NULL.  Nothing
 *     happens if *pd is already NULL.
 */
void release_dma_descs(dma_chain_desc **pd)
{
  dma_chain_desc *head = *pd;
  dma_chain_desc *tail;

  if (head == NULL)
    return;

  /* find the last descriptor of the chain being released */
  for (tail = head; tail->next != NULL; tail = tail->next)
    ;

  /* splice the chain in front of the current free list */
  tail->next = dma_free;
  dma_free = head;
  *pd = NULL;
}

/* Hand out one DMA chain descriptor.
 *
 * Descriptors are taken from the free list; when that is empty a new block of
 * ALLOC_DMA_DESCS descriptors is allocated, its first element is handed out
 * and the rest are threaded onto the free list.  Blocks are never freed;
 * descriptors are recycled with release_dma_descs().
 *
 * ppd  receives the descriptor.  Its 'next' field is always NULL on return,
 *      so it forms a valid one-element chain that is safe to pass to
 *      release_dma_descs() even if the caller bails out before filling it
 *      in; all other fields are unspecified.
 *
 * Returns NULL on success or an error block if memory is exhausted (in which
 * case *ppd is not written).
 */
os_error *get_dma_desc(dma_chain_desc **ppd)
{
  /* DMA chain descriptors /must/ be 32-byte aligned because the IOP321 assumes this */
  enum { DESC_ALIGN = 32 };
  dma_chain_desc *pd = dma_free;

  if (pd)
  {
    dma_free = pd->next;
  }
  else
  {
    void *blk;
    int i;

    /* over-allocate by DESC_ALIGN-1 bytes so that the array can be rounded up
       to an aligned address whatever alignment malloc happens to return */
    blk = malloc((DESC_ALIGN - 1) + (ALLOC_DMA_DESCS * sizeof(dma_chain_desc)));
    if (!blk) return (os_error*)"\0\0\0\0Not enough memory";

    /* NOTE(review): every element is only aligned if sizeof(dma_chain_desc)
       is a multiple of DESC_ALIGN - confirm against the declaration in dma.h */
    pd = (dma_chain_desc*)(((uintptr_t)blk + (DESC_ALIGN - 1)) & ~(uintptr_t)(DESC_ALIGN - 1));

    /* element 0 goes to the caller; elements 1..N-1 become the new free list */
    for (i = 1; i < ALLOC_DMA_DESCS-1; i++)
      pd[i].next = &pd[i+1];
    pd[ALLOC_DMA_DESCS-1].next = NULL;
    dma_free = &pd[1];
  }

  /* detach the descriptor: without this a recycled descriptor would still
     point into the free list and a freshly allocated one would hold an
     uninitialised pointer */
  pd->next = NULL;

  *ppd = pd;
  return NULL;
}

// Build, or retarget, the chain of DMA descriptors that copies a rectangle of a source image to the display.
//
// In general we're DMAing a rectangular region of the source image to a rectangular region of the display surface
// with the widths of the two regions being equal but the heights being different (linerep/subsample):
// destination row y takes its data from source row  sr->y0 + (src_height * y) / rows.
//
// note that here we need a number of descriptors == the product of source images and display banks
//
// d          logical address of the destination surface; its offset from screen_base is added to vram_base
//            to form the physical destination base
// s          logical base address of the source image
// ppd        do_source:  receives the first new descriptor; later ones are linked through ->next
//            !do_source: points at the head of an existing chain, which is walked through ->next
// pppd       do_source:  receives the physical address of the first new descriptor; the physical address of
//            each later one is stored in its predecessor's 'nda' field.  Not used when !do_source
// sr, dr     source and destination rectangles; x coordinates are converted to bytes using screen_lg2bpp.
//            The widths must be equal (asserted)
// d_linelen  bytes between successive destination lines
// s_linelen  bytes between successive source lines
// do_source  true:  allocate descriptors and work out the physical source address of every physically
//                   contiguous run of each source scanline
//            false: reuse the descriptors already in the chain, rewriting only their destination addresses
//
// Returns NULL on success, or the error from get_dma_desc / xosmemory_page_op.  On error the descriptors
// obtained so far are NOT released.
//
os_error *get_addresses(byte *d, const byte *s, dma_chain_desc **ppd, byte **pppd, os_box *sr, os_box *dr,
         size_t d_linelen, size_t s_linelen, bool do_source)
{
  int width = sr->x1 - sr->x0, width_bytes = (width << screen_lg2bpp) / 8;
  /* lowest and highest descriptor addresses touched, for the cache clean at the end */
  dma_chain_desc *pdmin = NULL, *pdmax = NULL;
  int src_height = sr->y1 - sr->y0;
  int y, rows = dr->y1 - dr->y0;
  os_page_block pageblk[2];  //??? SORT IT OUT! (only element 0 is ever used)
  bool single = false;       /* never set true in this function, so each pass handles one scanline */
  unsigned phys_base = vram_base;
  os_error *err;

//log_printf("GETADDR %d\n", two_screens);

//??? YUM! YUM!  (shift the physical base by the destination's offset from the start of the screen)
phys_base += (d - screen_base);

  /* cannot scale horizontally using DMA, only vertically */
  assert(width == dr->x1 - dr->x0);

  /* one pass per destination row */
  for (y = 0; y < rows; y++)
  {
    unsigned pdst;
    byte *sp, *esp;

    /* physical address of the left edge of this destination row */
    pdst = phys_base + ((dr->y0 + y) * d_linelen) + ((dr->x0 << screen_lg2bpp) / 8);

    /* select source scanline */
    sp = (byte*)s + (s_linelen * (sr->y0 + (src_height * y) / rows)) + ((sr->x0 << screen_lg2bpp) / 8);
    esp = sp + (single ? (width_bytes * src_height) : width_bytes);

//??? we have two almost completely different loops interwined here!

    /* get list of physically contiguous regions of this source scanline */
    while (sp < esp)
    {
      unsigned chunk_size;
      dma_chain_desc *pd;
      unsigned psrc;

      /* bytes from sp to the end of its page (overwritten below when reusing descriptors) */
      chunk_size = page_size - ((int)sp & (page_size-1));

      /* allocate a DMA chain descriptor */
      if (do_source)
      {
        err = get_dma_desc(&pd);
        if (err)
        {
/*??? release all the descriptors that we've allocated */
          return err;
        }
        /* all DMA chain descriptors must be 32-byte aligned */
        assert(!((int)pd & 31));
  
        /* build links from previous DMA chain descriptor */
        /* (the controller follows physical addresses, so translate the descriptor's own address) */
        pageblk[0].log_addr = (byte*)pd;
        err = xosmemory_page_op(osmemory_GIVEN_LOG_ADDR | osmemory_RETURN_PHYS_ADDR, pageblk, 1);
        /* NOTE(review): on this path pd has been taken from the pool but is neither linked nor released */
        if (err) return err;

        /* append to linked lists */
        /* (ppd is the logical ->next link, pppd the physical 'nda' link of the previous descriptor) */
//log_printf("ppd = %.8X\n", (int)ppd);
//log_printf("pppd = %.8X\n", (int)pppd);
        *ppd  = pd;
        *pppd = pageblk[0].phys_addr;

        /* get physical start address of this region */
//printf("Chunk starts at %p\n", sp);
        pageblk[0].log_addr = sp;
        err = xosmemory_page_op(osmemory_GIVEN_LOG_ADDR | osmemory_RETURN_PHYS_ADDR, pageblk, 1);
        /* NOTE(review): pd is already linked into the chain here but pd->next has not yet been set by this function */
        if (err) return err;
        psrc = (unsigned)pageblk[0].phys_addr;
  
        /* fill in the descriptor: physical destination in 'pad', physical source in 'lad',
           and no successor yet (both the 'nda' and ->next links are cleared) */
        pd->nda  = 0;
        pd->pad  = pdst;
        pd->puad = 0;
        pd->lad  = psrc;
        pd->next = NULL;
      }
      else
      {
        /* we reuse the existing descriptors because we only need to change the destination address */
        pd = *ppd;
        assert(pd);
        ppd = &pd->next;

        psrc = pd->lad;

        pd->pad = pdst;
      }

      /* track the address range of descriptors that have been written */
      if (pdmin)
      {
        if (pd < pdmin) pdmin = pd;
        if (pd > pdmax) pdmax = pd;
      }
      else
        pdmin = pdmax = pd;
  
      if (do_source)
      {
//??? check contiguity using descriptors if reusing!
        /* step to the next page boundary, then keep absorbing pages for as long as their
           physical addresses follow on directly from the previous one */
        sp += chunk_size;
        psrc += chunk_size;

        while (sp < esp)
        {
          pageblk[0].log_addr = sp;
          err = xosmemory_page_op(osmemory_GIVEN_LOG_ADDR | osmemory_RETURN_PHYS_ADDR, pageblk, 1);
          if (err) return err;
//          printf(" phys %.8X\n", pageblk[0].phys_addr);

          /* not physically contiguous: this page needs its own descriptor */
          if ((unsigned)pageblk[0].phys_addr != psrc) break;
     
          /* okay, we can include this page too */
          chunk_size += page_size;
     
          /* now, let's try for another */
          psrc += page_size;
          sp += page_size;
        }

        /* trim the part of the last page that lies beyond the end of the scanline */
        if (sp > esp)
          chunk_size -= (sp - esp);

        /* okay we've got as many physically contiguous pages as we can */
//???      assert(chunk_size <= 1024);
//         appears to work anyway, though the datasheet suggests that it can't (a bit ambiguous?)
        pd->bc = chunk_size;
        pd->dc = 0x0F;  //??? this mayn't be the fastest setting?

        /* the next descriptor is linked from this one, logically and physically */
        ppd = &pd->next;
        pppd = (byte**)&pd->nda;
//log_printf("set ppd %.8X %.8X\n", (int)ppd, (int)pppd);
      }
      else
      {
        /* reuse the byte count recorded when the descriptor was first built */
        chunk_size = pd->bc;
        sp += chunk_size;
      }

      /* the next chunk lands immediately after this one in the destination row */
      pdst += chunk_size;
    }
  }

//  if (err)

  if (pdmin)
  {
    /* ensure that our DMA chain descriptors have been written out to the SDRAM
       (the DMA controller can't see the CPU caches) */

    /* the range runs from the lowest descriptor touched to the last word of the highest one,
       so it also covers anything that lies between them in memory */
    clean_Dcache(os_GIVEN_CODE_RANGE, pdmin, (byte*)(pdmax+1)-4);
  }

  return NULL;
}

#endif
